{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "import numpy as np\n",
    "import joblib \n",
    "import matplotlib.pyplot as plt\n",
    "import seaborn as sns\n",
    "%matplotlib inline"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "COL_NUM = 100\n",
    "raw_col = joblib.load('data_sel.lz4').columns"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "import time\n",
    "def timer(func):\n",
    "    def wrapper(*args,**kwargs):\n",
    "        start = time.time()\n",
    "        result = func(*args,**kwargs)\n",
    "        end = time.time()\n",
    "        print(func.__name__+'运行时间：',end-start)\n",
    "        return result\n",
    "    return wrapper"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "valid_date = pd.read_csv('../../preprocess_data/valid_date.csv').drop(columns=['id','loan_hour'])\n",
    "valid_raw = joblib.load('../../preprocess_data_new/valid_nodup.lz4').drop(columns=['id','loan_dt'])\n",
    "valid_null = pd.read_csv('../../preprocess_data_new/valid_row_null.csv').drop(columns=['id'])\n",
    "valid_tag = pd.read_csv('../../2_feature_select_new/predict_tag_new/valid_tag.csv',usecols=['tag'])\n",
    "week_test = joblib.load('../../preprocess_data_discrete/week_test.lz4')\n",
    "\n",
    "valid = pd.concat([valid_date,valid_raw,valid_null,valid_tag,week_test],axis=1)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [],
   "source": [
    "valid_sel = valid[raw_col]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['./valid_data/valid_sel.lz4']"
      ]
     },
     "execution_count": 18,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "joblib.dump(valid_sel,'./valid_data/valid_sel.lz4',compress='lz4')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "valid_sel = joblib.load('./valid_data/valid_sel.lz4')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['sel_cols']"
      ]
     },
     "execution_count": 5,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "joblib.dump(list(valid_sel.columns),'sel_cols')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## multiply"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "from joblib import Parallel, delayed\n",
    "\n",
    "def one_cross(df,col_0):\n",
    "    data_cf = pd.DataFrame()\n",
    "    for col_1 in df.columns:\n",
    "        data_cf['{0}*{1}'.format(col_0,col_1)] = (df[col_0]*df[col_1]).values\n",
    "    return data_cf\n",
    "\n",
    "@timer\n",
    "def parallel_cross(df):\n",
    "    df_list = Parallel(n_jobs=32,verbose=10)(delayed(one_cross)(df,col_0) for col_0 in df.columns)\n",
    "    return pd.concat(df_list,axis=1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [],
   "source": [
    "valid_sel = joblib.load('./valid_data/valid_sel.lz4')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "[Parallel(n_jobs=32)]: Using backend LokyBackend with 32 concurrent workers.\n",
      "[Parallel(n_jobs=32)]: Done   8 tasks      | elapsed:    3.2s\n",
      "[Parallel(n_jobs=32)]: Done  21 tasks      | elapsed:    3.7s\n",
      "[Parallel(n_jobs=32)]: Done  32 out of  86 | elapsed:    4.2s remaining:    7.1s\n",
      "[Parallel(n_jobs=32)]: Done  41 out of  86 | elapsed:    4.6s remaining:    5.1s\n",
      "[Parallel(n_jobs=32)]: Done  50 out of  86 | elapsed:    5.1s remaining:    3.6s\n",
      "[Parallel(n_jobs=32)]: Done  59 out of  86 | elapsed:    5.5s remaining:    2.5s\n",
      "[Parallel(n_jobs=32)]: Done  68 out of  86 | elapsed:    5.9s remaining:    1.6s\n",
      "[Parallel(n_jobs=32)]: Done  77 out of  86 | elapsed:    6.3s remaining:    0.7s\n",
      "[Parallel(n_jobs=32)]: Done  86 out of  86 | elapsed:    6.6s remaining:    0.0s\n",
      "[Parallel(n_jobs=32)]: Done  86 out of  86 | elapsed:    6.6s finished\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "parallel_cross运行时间： 9.12183427810669\n"
     ]
    }
   ],
   "source": [
    "valid_cf = parallel_cross(valid_sel)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['./valid_data/valid_cf_86.lz4']"
      ]
     },
     "execution_count": 9,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "joblib.dump(valid_cf,'./valid_data/valid_cf_{}.lz4'.format(valid_sel.shape[1]),compress='lz4')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 输出valid_data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "valid_sel = joblib.load('./valid_data/valid_sel.lz4')\n",
    "valid_cf_86 = joblib.load('./valid_data/valid_cf_86.lz4')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "cf_cols = joblib.load('./cf_cols')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [],
   "source": [
    "valid_cf_sel = valid_cf_86[cf_cols]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['./valid_data/valid_cf_sel.lz4']"
      ]
     },
     "execution_count": 7,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "joblib.dump(valid_cf_sel,'./valid_data/valid_cf_sel.lz4',compress='lz4')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "valid_data = pd.concat([valid_sel,valid_cf_sel],axis=1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(20000, 234)"
      ]
     },
     "execution_count": 6,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "valid_data.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['./valid_data/valid_data.lz4']"
      ]
     },
     "execution_count": 9,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "joblib.dump(valid_data,'./valid_data/valid_data.lz4',compress='lz4')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## valid_cf_86_fs10"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "valid_cf_86 = joblib.load('./valid_data/valid_cf_86.lz4')\n",
    "cf_score = joblib.load('./cf_score')\n",
    "valid_cf_86_fs10 = valid_cf_86[cf_score[cf_score>10].index]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['./valid_data/valid_cf_86_fs10.lz4']"
      ]
     },
     "execution_count": 6,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "joblib.dump(valid_cf_86_fs10,'./valid_data/valid_cf_86_fs10.lz4',compress='lz4')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.5"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
